library(readr)
library(reshape2)
library(dplyr)
df1 <- read_csv(
"C:/Personal/71099942/Documents/GitHub/Data_Viz_course_materials/course_materials/Exercises/03_olympics/athletes_and_events.csv", col_names = T)
df2 <- read_csv(
"C:/Personal/71099942/Documents/GitHub/Data_Viz_course_materials/course_materials/Exercises/03_olympics/gdp_pop.csv", col_names = T)
df3 <- read_csv(
"C:/Personal/71099942/Documents/GitHub/Data_Viz_course_materials/course_materials/Exercises/03_olympics/noc_regions.csv", col_names = T)
df1$area <- ifelse(df1$NOC == "GDR"
| df1$NOC == "FRG",
"GER", df1$NOC)
df1$area <- ifelse(df1$area == "URS",
"RUS", df1$area)
df1$area <- ifelse(df1$area == "YUG",
"SRB", df1$area)
df <- merge(df1, df2, by.x = "area", by.y = "Code", all.x = F, all.y = T)
df <- merge(df, df3, by.x = "NOC", by.y = "NOC", all.x = F, all.y = T)
df$medal_rank <- ifelse(df$Medal == "Bronze", 1, df$Medal)
df$medal_rank <- ifelse(df$medal_rank == "Silver", 2, df$medal_rank)
df$medal_rank <- ifelse(df$medal_rank == "Gold", 3, df$medal_rank)
df$medal_any <- ifelse(is.na(df$Medal), 0, 1)
summer_country <- df %>%
group_by(region, Season) %>%
summarise(
count_medals = sum(medal_any, na.rm = TRUE)
) %>%
filter(Season == "Summer") %>%
arrange(desc(count_medals))
summer_medals <- df %>%
group_by(region, Season, Medal) %>%
summarise(
count_medals = sum(medal_any, na.rm = TRUE)
) %>%
filter(Season == "Summer", !is.na(Medal))
head(summer_medals)
## # A tibble: 6 x 4
## # Groups: region, Season [3]
## region Season Medal count_medals
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan Summer Bronze 2
## 2 Algeria Summer Bronze 8
## 3 Algeria Summer Gold 5
## 4 Algeria Summer Silver 4
## 5 Argentina Summer Bronze 91
## 6 Argentina Summer Gold 91
summer_gold_super_powers <- df %>%
group_by(region, Season, Year, Medal) %>%
summarise(
count_medals = sum(medal_any, na.rm = TRUE),
count_athlets = n()
) %>%
filter(Season == "Summer", Medal == "Gold",
region == "USA"
| region == "Russia"
| region == "China")
library(ggplot2)
library(ggthemes)
g1 <- ggplot(summer_gold_super_powers, aes(x = Year, y = count_medals)) +
geom_line(aes(color = region)) +
geom_point(aes(color = region), size=3) +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "Gold Medals in Summer Olympics",
subtitle = "Total Count by Region*",
caption = "*Russia represented by the USSR before 1991")
# + geom_hline(yintercept = c(5,6), lty=2) +
# scale_y_continuous(breaks=seq(1, 6, 1),
# label=sprintf("$%s",seq(300,400,20))) +
# scale_x_continuous(breaks=x,label=x) +
# annotate("text", x = c(2012), y = c(10),
# adj=1, family="serif", label = c(
# "Gold Medals / in Summer Olympics"))
g1
summer_medals$Medal <- factor(summer_medals$Medal, levels = c("Bronze", "Silver", "Gold"))
summer_medals %>%
filter(region == "USA"
| region == "Russia"
| region == "China") %>%
arrange(Medal) %>%
ggplot(aes(x = region, y = count_medals, fill = Medal)) +
geom_bar(stat = "identity") +
scale_fill_manual("Medal", values = c("Gold" = "gold", "Silver" = "grey", "Bronze" = "tan3")) +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
ggtitle("Total Amount of Medals in Summer Olympics")
The main focus of the plots was to compare the amount of medals the three world super-power have acquired in the summer Olympics. I would recommed using the first plot with the gold medal count overtime because it allows to add historical context so that the readers can learn relevant pieces of history that impacted participation and the amount of medals earned. For example, it becomes clear from the plot that China has not started participating in the summer games until 1984 which gives a great way to insert history facts into the story.
gold_super_powers <- df %>%
filter(Year >= 2000) %>%
group_by(region, Season, Medal) %>%
summarise(
count_medals = sum(medal_any, na.rm = TRUE),
count_athlets = n()
) %>%
filter(Medal == "Gold",
region == "USA"
| region == "Russia"
| region == "China")
df4 <- df[ , c(20, 18, 19)]
gold_super_powers <- merge(gold_super_powers, df4,
by.x = "region", by.y = "region",
all.x = T, all.y = F)
gold_super_powers <- gold_super_powers[!duplicated(gold_super_powers), ]
gold_super_powers <- gold_super_powers[complete.cases(gold_super_powers), ]
gold_super_powers <- gold_super_powers[c(-2, -4), ]
gold_super_powers$pop.adg <- gold_super_powers$count_medals/gold_super_powers$Population
gold_super_powers$gdp.adg <- gold_super_powers$`GDP per Capita` / gold_super_powers$count_medals
gold_super_powers <- rename(gold_super_powers, GDP_per_capita = "GDP per Capita")
ggplot(gold_super_powers, aes(fill=Season, y=count_medals, x=region)) +
geom_bar(stat="identity") +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "USA gets by far the most Olympic gold...",
subtitle = "Total Count of Gold medals since 2000")
ggplot(gold_super_powers, aes(fill=Season, y=pop.adg, x=region)) +
geom_bar(stat="identity") +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "but Russia's population is slightly more athletically talented",
subtitle = "...mostly due to Winter Olympic advantage ",
caption = "Olympic gold medals per capita")
library(ggrepel)
superpowers <- ggplot(gold_super_powers, aes(x = GDP_per_capita, y = count_medals, color = Season)) +
geom_point(size = 6) +
geom_text_repel(aes(label = region),
color = "gray20", size = 4, force = 10, check_overlap = T, nudge_x = 35, nudge_y = 35) +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "Superpowers with higher GDP per capita win more gold medals... \n but only for the Summer Olympics",
subtitle = "Olympic gold medals and GDP per capita")
superpowers
library(rvest)
library(stringr)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Summer_Olympic_Games")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[8]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ",")[,2]
hosts <- hosts[hosts$Year > 1944 & hosts$Year < 2020, ]
hosts$country <- ifelse(hosts$country == " Soviet Union", "Russia", hosts$country)
hosts$country <- ifelse(hosts$country == " West Germany", "Germany", hosts$country)
hosts$country <- trimws(hosts$country, which = c("both", "left", "right"))
adv <- df %>%
filter(Season == "Summer", Year > 1944 & Year < 2020) %>%
group_by(region, Season, Year, City, Medal) %>%
summarise(
count_medals = sum(medal_any, na.rm = TRUE),
) %>%
filter(Season == "Summer", !is.na(Medal))
adv$region <- ifelse(adv$region == "UK", "United Kingdom", adv$region)
adv$region <- ifelse(adv$region == "USA", "United States", adv$region)
countries <- hosts$country
adv <- adv[adv$region %in% countries, ]
adv$City <- ifelse(adv$City == "Athina", "Athens", adv$City)
adv$City <- ifelse(adv$City == "Moskva", "Moscow", adv$City)
adv$City <- ifelse(adv$City == "Roma", "Rome", adv$City)
adv <- merge(adv, hosts[ , c("city", "country")], by.x = "City", by.y = "city", all.x = T, all.y = F)
adv$host <- ifelse(adv$region == adv$country, "Host Advantage", "No Advantage")
adv <- adv[complete.cases(adv), ]
adv1 <- adv %>%
group_by(host, region) %>%
summarise(
avg_medals = mean(count_medals, na.rm = T)
)
adv1$avg_medals <- round(adv1$avg_medals, digits = 1)
adv1 %>% arrange(host, host, desc(avg_medals))
## # A tibble: 30 x 3
## # Groups: host [2]
## host region avg_medals
## <chr> <chr> <dbl>
## 1 Host Advantage Russia 147.
## 2 Host Advantage United States 102.
## 3 Host Advantage Germany 84.3
## 4 Host Advantage China 61.3
## 5 Host Advantage Australia 41.7
## 6 Host Advantage United Kingdom 31.2
## 7 Host Advantage Italy 29.3
## 8 Host Advantage South Korea 25.7
## 9 Host Advantage Spain 23
## 10 Host Advantage Japan 20.7
## # ... with 20 more rows
adv1$region <- factor(adv1$region, levels = rev(unique(adv1$region)))
host_adv <- ggplot(adv1, aes(fill= host, x = region , y = avg_medals)) +
geom_bar(position="dodge", stat="identity") +
coord_flip() +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "Host advantage is real!",
subtitle = "Average medals won in Summer Olympics since 1944")
host_adv
athlets <- df %>%
group_by(ID, Name, Sex, region, Sport) %>%
summarise(
medals = sum(medal_any)
) %>%
arrange(
desc(medals)
) %>%
filter(
medals > 11
)
athlets$Name <- factor(athlets$Name, levels = (rev(unique(athlets$Name))))
plot_athlets <- ggplot(athlets, aes(x = medals, y = Name, color = Sex, shape = Sport)) +
geom_point(size = 4) +
theme_tufte(base_size = 15) + theme(axis.title=element_blank()) +
labs(title = "Most successful \nOlympic athletes of all time",
subtitle = "Total medal count"
)
plot_athlets
athlets1 <- df %>%
group_by(ID, Name, Sex, region, Sport) %>%
summarise(
medals = sum(medal_any)
) %>%
arrange(
desc(medals)
) %>%
filter(
medals > 0
)
library(DT)
datatable(athlets1)
pretty_headers <-
gsub("[.]", " ", colnames(athlets1)) %>%
str_to_title()
athlets1 %>%
datatable(
rownames = FALSE,
colnames = pretty_headers,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:"))
)
In order to continue on the subject of the superpowers in the Olympic games, i would like to provide the list of the most successful athlets and the countries they represented to allow the users to see to what extent these countreis are represented by the superpowers, as well as the gender of the athlets and the sport they participated in.
library(plotly)
ggplotly(superpowers + theme(legend.title = element_blank()))
The interactive plots are useful to provide a reader with additional tools to explore and single out different dimensions data. This may be especially beneficial when there is a lot of data in a graph, such that interactive legened, for example, can provide a way to compare different categories and spot patterns.
ggplotly(host_adv)